專案說明

  1. 對經營現況進行說明。
  2. 對選擇的目標客群進行預測,像是每位顧客的回購機率、預期營收、預期獲利以及終生價值。
  3. 利用行銷工具說明如何提升或是維持客群的特點,並瞭解透過此工具的所得到預期的總效益。

資料彙整流程

Fig-1:交易資料彙整
Fig-1:交易資料彙整

1.交易項目紀錄:Z

rm(list=ls(all=T))
knitr::opts_chunk$set(paged.print=FALSE, comment = NA)
pacman::p_load(magrittr, readr, caTools, ggplot2, dplyr, vcd,plotly,tidyr,latex2exp,Matrix)
讀進資料
Z = read_csv("data/ta_feng_all_months_merged.csv") %>% 
  data.frame %>% setNames(c(
    "date","cust","age","area","cat","prod","qty","cost","price"))
Rows: 817741 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): TRANSACTION_DT, CUSTOMER_ID, AGE_GROUP, PIN_CODE, PRODUCT_ID
dbl (4): PRODUCT_SUBCLASS, AMOUNT, ASSET, SALES_PRICE

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(Z)
[1] 817741
tibble(Z)
# A tibble: 817,741 × 9
   date      cust     age   area       cat prod            qty  cost price
   <chr>     <chr>    <chr> <chr>    <dbl> <chr>         <dbl> <dbl> <dbl>
 1 11/1/2000 01104905 45-49 115     110411 4710199010372     2    24    30
 2 11/1/2000 00418683 45-49 115     120107 4710857472535     1    48    46
 3 11/1/2000 01057331 35-39 115     100407 4710043654103     2   142   166
 4 11/1/2000 01849332 45-49 Others  120108 4710126092129     1    32    38
 5 11/1/2000 01981995 50-54 115     100205 4710176021445     1    14    18
 6 11/1/2000 01741797 35-39 115     110122 0078895770025     1    54    75
 7 11/1/2000 00308359 60-64 115     110507 4710192225520     1    85   105
 8 11/1/2000 01607000 35-39 221     520503 4712936888817     1    45    68
 9 11/1/2000 01057331 35-39 115     320203 4715398106864     2    70    78
10 11/1/2000 00236645 35-39 Unknown 120110 4710126091870     1    43    53
# ℹ 817,731 more rows
日期格式轉換
Z$date = as.Date(Z$date, format="%m/%d/%Y")
par(cex=0.8)
hist(Z$date,'weeks',freq=T,las=2)

年齡層級、郵遞區號

瞭解顧客的年齡客群與分布地區

age.group = c("<25","25-29","30-34","35-39","40-44",
              "45-49","50-54","55-59","60-64",">65")
Z$age = c(paste0("a",seq(24,69,5)),"a99")[match(Z$age,age.group,11)]
Z$area = paste0("z",Z$area)
Fig-2:郵遞區號
Fig-2:郵遞區號
par(mfrow=c(1,2),cex=0.7)
table(Z$age, useNA='ifany') %>% barplot(main="Age Groups", las=2)
table(Z$area,useNA='ifany') %>% barplot(main="Areas", las=2)

經由上圖可以發現顧客年齡多為29-44為主,而區域則是z115(南港)與z221(汐止)為主。

處理離群值
# Quantile of Variables
sapply(Z[,7:9], quantile, prob=c(.99, .999, .9995))
       qty   cost   price
99%      6  858.0 1014.00
99.9%   14 2722.0 3135.82
99.95%  24 3799.3 3999.00
# Remove Outliers
Z = subset(Z, qty<=24 & cost<=3800 & price<=4000) 
nrow(Z)  
[1] 817182
彙總訂單

把每一天、每一個顧客的交易項目彙總為一張訂單

Z$tid = group_indices(Z, date, cust) # same customer same day
Warning: The `...` argument of `group_indices()` is deprecated as of dplyr 1.0.0.
ℹ Please `group_by()` first
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
generated.
資料總覽
# No. cust, cat, prod, tid
sapply(Z[c("cust","cat","prod","tid")], n_distinct)
  cust    cat   prod    tid 
 32256   2007  23789 119422 

2.交易紀錄:X

交易資料彙整
X = Z %>% group_by(tid) %>% summarise(
  date = min(date),          # 交易日期  
  cust = min(cust),          # 顧客 ID
  age = min(age),            # 顧客 年齡級別
  area = min(area),          # 顧客 居住區別
  items = n(),               # 交易項目(總)數
  pieces = sum(qty),         # 產品(總)件數
  total = sum(price),        # 交易(總)金額
  gross = sum(price - cost)  # 毛利
) %>% data.frame
nrow(X)
[1] 119422
處理離群值
# Check Quantile & Remove Outliers
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
       items   pieces     total    gross
99.9%     54  81.0000  9009.579 1824.737
99.95%    62  94.2895 10611.579 2179.817
99.99%    82 133.0000 16044.401 3226.548
# Remove Outliers
X = subset(X, items<=62 & pieces<95 & total<16000) # 119328
交易摘要
summary(X) 
      tid              date                cust               age           
 Min.   :     1   Min.   :2000-11-01   Length:119328      Length:119328     
 1st Qu.: 29855   1st Qu.:2000-11-29   Class :character   Class :character  
 Median : 59705   Median :2001-01-01   Mode  :character   Mode  :character  
 Mean   : 59712   Mean   :2000-12-31                                        
 3rd Qu.: 89581   3rd Qu.:2001-02-02                                        
 Max.   :119422   Max.   :2001-02-28                                        
     area               items            pieces           total        
 Length:119328      Min.   : 1.000   Min.   : 1.000   Min.   :    5.0  
 Class :character   1st Qu.: 2.000   1st Qu.: 3.000   1st Qu.:  227.0  
 Mode  :character   Median : 5.000   Median : 6.000   Median :  510.0  
                    Mean   : 6.802   Mean   : 9.222   Mean   :  851.6  
                    3rd Qu.: 9.000   3rd Qu.:12.000   3rd Qu.: 1080.0  
                    Max.   :62.000   Max.   :94.000   Max.   :15345.0  
     gross        
 Min.   :-1645.0  
 1st Qu.:   21.0  
 Median :   68.0  
 Mean   :  130.9  
 3rd Qu.:  168.0  
 Max.   : 3389.0  
每周交易次數
par(cex=0.8)
hist(X$date, "weeks", freq=T, las=2, main="No. Transaction per Week")

3.顧客資料:A

顧客資料彙整
d0 = max(X$date) + 1
A = X %>% mutate(
  days = as.integer(difftime(d0, date, units="days"))
  ) %>% group_by(cust) %>% summarise(
    TID = min(tid),
    r = min(days),      # recency
    s = max(days),      # seniority
    f = n(),            # frquency
    m = mean(total),    # monetary
    rev = sum(total),   # total revenue contribution
    raw = sum(gross),   # total gross profit contribution
    age = min(age),     # age group
    area = min(area),   # area code
  ) %>% data.frame      
nrow(A) # 32241
[1] 32241
par(mfrow=c(1,2),cex=0.7)
table(A$age, useNA='ifany') %>% barplot(main="Age Groups",las=2)
table(A$area, useNA='ifany') %>% barplot(main="Areas",las=2)    

顧客摘要
summary(A) 
     cust                TID               r                s         
 Length:32241       Min.   :     1   Min.   :  1.00   Min.   :  1.00  
 Class :character   1st Qu.: 11363   1st Qu.:  9.00   1st Qu.: 56.00  
 Mode  :character   Median : 29799   Median : 26.00   Median : 92.00  
                    Mean   : 39261   Mean   : 37.45   Mean   : 80.78  
                    3rd Qu.: 61602   3rd Qu.: 60.00   3rd Qu.:110.00  
                    Max.   :119422   Max.   :120.00   Max.   :120.00  
       f                m                rev              raw         
 Min.   : 1.000   Min.   :    8.0   Min.   :     8   Min.   : -784.0  
 1st Qu.: 1.000   1st Qu.:  365.0   1st Qu.:   707   1st Qu.:   75.0  
 Median : 2.000   Median :  705.7   Median :  1750   Median :  241.0  
 Mean   : 3.701   Mean   :  993.1   Mean   :  3152   Mean   :  484.6  
 3rd Qu.: 4.000   3rd Qu.: 1291.0   3rd Qu.:  3968   3rd Qu.:  612.0  
 Max.   :85.000   Max.   :12636.0   Max.   :127686   Max.   :20273.0  
     age                area          
 Length:32241       Length:32241      
 Class :character   Class :character  
 Mode  :character   Mode  :character  
                                      
                                      
                                      
par(mfrow=c(3,2), mar=c(3,3,4,2))
for(x in c('r','s','f','m')) 
  hist(A[,x],freq=T,main=x,xlab="",ylab="",cex.main=2)
hist(pmin(A$f,10),0:10,freq=T,xlab="",ylab="",cex.main=2)
hist(log(A$m,10),freq=T,xlab="",ylab="",cex.main=2)

4.Check & Save

將剛剛處理完的資料全部重新儲存。

is.na(Z) %>% colSums
 date  cust   age  area   cat  prod   qty  cost price   tid 
    0     0     0     0     0     0     0     0     0     0 
is.na(X) %>% colSums
   tid   date   cust    age   area  items pieces  total  gross 
     0      0      0      0      0      0      0      0      0 
is.na(A) %>% colSums
cust  TID    r    s    f    m  rev  raw  age area 
   0    0    0    0    0    0    0    0    0    0 
A0 = A; X0 = X; Z0 = Z
save(Z0, X0, A0, file="data/tf0.rdata")
類別資料的分類統計
mosaic(~area+age, data=A, shade=T)

由於a99(沒有年齡資料的顧客)人數不多,而且特徵很獨特,探索時我們可以考慮濾掉這群顧客

A0 %>% filter(age!="a99") %>%    # 濾掉沒有年齡資料的顧客('a99')
  group_by(age) %>% summarise(
  TID = min(TID),
  Group.Size = n(),              # 族群人數
  avg.Freq = mean(f),            # 平均購買次數
  avg.Revenue = sum(f*m)/sum(f)  # 平均客單價
  ) %>% 
  ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
  geom_point(aes(col=age, size=Group.Size), alpha=0.5) +
  geom_text(aes(label=age)) +
  scale_size(range=c(5,25)) +
  theme_bw() + theme(legend.position="none") +
  ggtitle("年齡區隔特徵 (泡泡大小:族群人數)") + 
  ylab("平均購買次數") + xlab("平均客單價")

經上面的泡泡圖顯示,平均客單價較高的客群為a34與a39。

地理區隔特徵
A0 %>% filter(age!="a99") %>%    # 濾掉沒有年齡資料的顧客('a99')
  group_by(area) %>% summarise(
  TID = min(TID),
  Group.Size = n(),              # 族群人數
  avg.Freq = mean(f),            # 平均購買次數
  avg.Revenue = sum(f*m)/sum(f)  # 平均客單價
  ) %>% 
  ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
  geom_point(aes(col=area, size=Group.Size), alpha=0.5) +
  geom_text(aes(label=area)) +
  scale_size(range=c(5,25)) +
  theme_bw() + theme(legend.position="none") +
  ggtitle("地理區隔特徵 (泡泡大小:族群人數)") + 
  ylab("平均購買次數") + xlab("平均客單價")

故綜上面顯示的兩張泡泡圖分析,我們將年齡與地區作為我們的TA篩選條件。 首先,我們先選出了a34與a39的合併組合,因為從第一張泡泡圖可看出此組合的平均客單價為最高,故此組合為一個可利用行銷而提高收益的對象。

再來,我們依地區分布再選出了z115與z221,而理由是z115在上述的年齡組合的消費人數較少,我們希望利用行銷手法來拓展潛在客群,並且提升在Z115的消費人數;z221地區則在上述年齡的組合的消費人數較高,故我們希望將此常客族群繼續留住,再提高他們對TaFeng的忠誠度。

z221_cu = subset(A0, area == "z221" & (age == "a34" | age == "a39"))#3653人
z115_cu =subset(A0, area == "z115" &(age == "a34" | age == "a39"))#3550人

這裡將兩群TA篩選出來,而可看到兩群人數為總顧客的十分之一,有達到在行銷人數條件的標準


資料整理

Fig-1:交易資料彙整
Fig-1:交易資料彙整

1.製作預測變數

  1. 資料備註:
feb01 = as.Date("2001-02-01")   # 資料分割日期 
Z = subset(Z0, date < feb01)    # 618212 項目
1.重新匯整交易紀錄
X = group_by(Z, tid) %>% summarise(
  date = first(date),  # date of transaction
  cust = first(cust),  # customer id
  age = first(age),    # age group
  area = first(area),  # area group
  items = n(),                # number of items
  pieces = sum(qty),          # number of pieces
  total = sum(price),         # total amount
  gross = sum(price - cost)   # raw profit
  ) %>% data.frame  # 88387 交易筆數
summary(X)
      tid             date                cust               age           
 Min.   :    1   Min.   :2000-11-01   Length:88387       Length:88387      
 1st Qu.:22098   1st Qu.:2000-11-23   Class :character   Class :character  
 Median :44194   Median :2000-12-12   Mode  :character   Mode  :character  
 Mean   :44194   Mean   :2000-12-15                                        
 3rd Qu.:66291   3rd Qu.:2001-01-12                                        
 Max.   :88387   Max.   :2001-01-31                                        
     area               items             pieces            total        
 Length:88387       Min.   :  1.000   Min.   :  1.000   Min.   :    5.0  
 Class :character   1st Qu.:  2.000   1st Qu.:  3.000   1st Qu.:  230.0  
 Mode  :character   Median :  5.000   Median :  6.000   Median :  522.0  
                    Mean   :  6.994   Mean   :  9.453   Mean   :  888.7  
                    3rd Qu.:  9.000   3rd Qu.: 12.000   3rd Qu.: 1120.0  
                    Max.   :112.000   Max.   :339.000   Max.   :30171.0  
     gross        
 Min.   :-1645.0  
 1st Qu.:   23.0  
 Median :   72.0  
 Mean   :  138.3  
 3rd Qu.:  174.0  
 Max.   : 8069.0  
移除不合理的離群資料
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
         items   pieces     total    gross
99.9%  56.0000  84.0000  9378.684 1883.228
99.95% 64.0000  98.0000 11261.751 2317.087
99.99% 85.6456 137.6456 17699.325 3389.646
X = subset(X, items<=64 & pieces<=98 & total<=11260) # 88387 -> 88295
重新匯整顧客資料
d0 = max(X$date) + 1
A = X %>% mutate(
  days = as.integer(difftime(d0, date, units="days"))
  ) %>% 
  group_by(cust) %>% summarise(
    TID = min(tid),
    r = min(days),      # recency
    s = max(days),      # seniority
    f = n(),            # frequency
    m = mean(total),    # monetary(平均客單價)
    rev = sum(total),   # total revenue contribution(總營收貢獻)
    raw = sum(gross),   # total gross profit contribution(總淨利貢獻)
    age = age[1],       # age group
    area = area[1],     # area code
  ) %>% data.frame      # 28584 顧客
nrow(A)
[1] 28584

2.製作預測變數

匯整最後一期的資料
feb = filter(X0, date>= feb01) %>% group_by(cust) %>% 
  summarise(amount = sum(total))  # 16900
  • feb$amount之中有最後一期來買過的16,900位顧客的營收貢獻
  • 將feb$amount匯入A
A = merge(A, feb, by="cust", all.x=T)
The Target for Classification - A\(buy - A\)amount是NA代表這位顧客最後一期沒來買
  • A$amount不是NA代表這位顧客最後一期有來買過
A$buy = !is.na(A$amount)  
table(A$buy, !is.na(A$amount))
       
        FALSE  TRUE
  FALSE 15342     0
  TRUE      0 13242
3. 總結資料集的內容
summary(A)
     cust                TID              r               s        
 Length:28584       Min.   :    1   Min.   : 1.00   Min.   : 1.00  
 Class :character   1st Qu.: 9852   1st Qu.:11.00   1st Qu.:47.00  
 Mode  :character   Median :25162   Median :21.00   Median :68.00  
                    Mean   :31046   Mean   :32.12   Mean   :61.27  
                    3rd Qu.:48834   3rd Qu.:53.00   3rd Qu.:83.00  
                    Max.   :88387   Max.   :92.00   Max.   :92.00  
                                                                   
       f                m                rev             raw         
 Min.   : 1.000   Min.   :    8.0   Min.   :    8   Min.   : -742.0  
 1st Qu.: 1.000   1st Qu.:  359.4   1st Qu.:  638   1st Qu.:   70.0  
 Median : 2.000   Median :  709.5   Median : 1566   Median :  218.0  
 Mean   : 3.089   Mean   : 1012.4   Mean   : 2711   Mean   :  420.8  
 3rd Qu.: 4.000   3rd Qu.: 1315.0   3rd Qu.: 3426   3rd Qu.:  535.0  
 Max.   :60.000   Max.   :10634.0   Max.   :99597   Max.   :15565.0  
                                                                     
     age                area               amount         buy         
 Length:28584       Length:28584       Min.   :    8   Mode :logical  
 Class :character   Class :character   1st Qu.:  454   FALSE:15342    
 Mode  :character   Mode  :character   Median :  993   TRUE :13242    
                                       Mean   : 1499                  
                                       3rd Qu.: 1955                  
                                       Max.   :28089                  
                                       NA's   :15342                  
#z115_ca在2月的購買人數
z115_cu_A1 = matrix(NA, nrow(z115_cu),ncol(z115_cu))
z115_cu_A1 = A[A$TID %in% z115_cu$TID,]
table(z115_cu_A1$buy)

FALSE  TRUE 
 1462  1808 

依上面結果,z221(汐止)的客群在2月的購買人數為1623。

#z221_cu在2月的購買人數
z221_cu_A1 = matrix(NA, nrow(z221_cu),ncol(z221_cu))
z221_cu_A1 = A[A$TID %in% z221_cu$TID,]
table(z221_cu_A1$buy)

FALSE  TRUE 
 1715  1623 

依上面結果,z221(南港)的客群在2月的購買人數為1808。


建立模型

Fig-3:建立模型
Fig-3:建立模型
1.分割訓練測試(TR)與測試資料(TS)

Data for 購買機率模型與購買金額模型
- A之中每一筆資料都可以拿來做購買機率模型
- 但只有A$amount有值的資料可以拿來做購買金額模型 : A2
- A和A2都需要在相同的目標變數分佈切割成訓練測試(TR)與測試資料(TS)
- 我們分別用caTools::sample.split()和隨機抽樣(sample)來製作分割向量:spl,spl2

z115的購買機率模型

#z115的購買機率模型
z115_X1 = subset(X, cust %in% z115_cu_A1$cust & date < as.Date("2001-02-01"))
z115_Z1 = subset(Z, cust %in% z115_cu_A1$cust & date < as.Date("2001-02-01"))
set.seed(2018)
z115_spl = sample.split(z115_cu_A1$buy, SplitRatio = 0.7)
c(nrow(z115_cu_A1),sum(z115_spl),sum(!z115_spl))
[1] 3270 2289  981
tapply(z115_cu_A1$buy,z115_spl,mean)
    FALSE      TRUE 
0.5524975 0.5530799 
cbind(z115_cu_A1, z115_spl) %>% filter(buy) %>%
  ggplot(aes(x=log(amount))) + geom_density(aes(fill=z115_spl), alpha=0.5)

z115的購買機率金額

z115_A2 = subset(z115_cu_A1, buy) %>% mutate_at(c("m","rev","amount"), log10)
n = nrow(z115_A2)
#隨機抽樣製作分割向量spl2
set.seed(2018); z115_spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(z115_A2), sum(z115_spl2), sum(!z115_spl2))
[1] 1808 1266  542
mean(z115_spl2)
[1] 0.7002212
cbind(z115_A2, z115_spl2) %>% 
  ggplot(aes(x=amount)) + geom_density(aes(fill=z115_spl2), alpha=0.5)

z221購買機率模型

z221_X1 = subset(X, cust %in% z221_cu_A1$cust & date < as.Date("2001-02-01"))
z221_Z1 = subset(Z, cust %in% z221_cu_A1$cust & date < as.Date("2001-02-01"))
set.seed(2018)
z221_spl = sample.split(z221_cu_A1$buy, SplitRatio = 0.7)
c(nrow(z221_cu_A1),sum(z221_spl),sum(!z221_spl))
[1] 3338 2336 1002
#TR和TS之中buy==TRUE的比例是一致的
tapply(z221_cu_A1$buy,z221_spl,mean)
    FALSE      TRUE 
0.4860279 0.4863014 
cbind(z221_cu_A1, z221_spl) %>% filter(buy) %>%
  ggplot(aes(x=log(amount))) + geom_density(aes(fill=z221_spl), alpha=0.5)

z221的購買金額模型

z221_A2 = subset(z221_cu_A1, buy) %>% mutate_at(c("m","rev","amount"), log10)
n = nrow(z221_A2)
#隨機抽樣製作分割向量spl2
set.seed(2018); z221_spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(z221_A2), sum(z221_spl2), sum(!z221_spl2))
[1] 1623 1136  487
mean(z221_spl2)
[1] 0.6999384
cbind(z221_A2, z221_spl2) %>% 
  ggplot(aes(x=amount)) + geom_density(aes(fill=z221_spl2), alpha=0.5)

2.儲存做好的資料
save(z115_Z1, z115_X1, z115_cu_A1, z115_spl, z115_spl2, file="data/tf_z115.rdata")
save(z221_Z1, z221_X1, z221_cu_A1, z221_spl, z221_spl2, file="data/tf_z221.rdata")

模型與預測

1.匯入剛儲存的資料

load("data/tf_z115.rdata")
load("data/tf_z221.rdata")

使用spl切割機率模型的訓練(TR)與測試資料(TS)

z115_TR = subset(z115_cu_A1,z115_spl)
z115_TS = subset(z115_cu_A1,!z115_spl)
z221_TR = subset(z221_cu_A1,z221_spl)
z221_TS = subset(z221_cu_A1,!z221_spl)

2.購買機率模型

z115目標客群的測試資料預測

z115_glm1 = glm(buy ~ ., z115_TR[,c(3:9,12)], family=binomial()) 
summary(z115_glm1)

Call:
glm(formula = buy ~ ., family = binomial(), data = z115_TR[, 
    c(3:9, 12)])

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -7.929e-01  1.620e-01  -4.893 9.91e-07 ***
r           -1.301e-02  2.562e-03  -5.080 3.78e-07 ***
s            7.731e-03  2.630e-03   2.939  0.00329 ** 
f            2.887e-01  4.198e-02   6.878 6.05e-12 ***
m            3.663e-05  8.637e-05   0.424  0.67146    
rev          4.363e-05  5.866e-05   0.744  0.45700    
raw         -3.742e-04  2.513e-04  -1.489  0.13640    
agea39      -3.868e-02  9.391e-02  -0.412  0.68039    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3147.4  on 2288  degrees of freedom
Residual deviance: 2644.2  on 2281  degrees of freedom
AIC: 2660.2

Number of Fisher Scoring iterations: 5
z115_pred = predict(z115_glm1, z115_TS, type = "response")
z115_cm = table(actual = z115_TS$buy, predict = z115_pred > 0.5);z115_cm
       predict
actual  FALSE TRUE
  FALSE   314  125
  TRUE    181  361
z115_acc.ts = z115_cm %>% {sum(diag(.))/sum(.)}
c(1-mean(z115_TS$buy) , z115_acc.ts)  # 0.6880734 
[1] 0.4475025 0.6880734
colAUC(z115_pred, z115_TS$buy)  #辨識率達0.7513722
                    [,1]
FALSE vs. TRUE 0.7513722

z221目標客群的測試資料預測

z221_glm1 = glm(buy ~ ., z221_TR[,c(3:9,12)], family=binomial()) 
summary(z221_glm1)

Call:
glm(formula = buy ~ ., family = binomial(), data = z221_TR[, 
    c(3:9, 12)])

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.133e+00  1.597e-01  -7.097 1.27e-12 ***
r           -1.326e-02  2.533e-03  -5.234 1.66e-07 ***
s            1.163e-02  2.616e-03   4.445 8.79e-06 ***
f            2.682e-01  4.534e-02   5.916 3.30e-09 ***
m            2.365e-05  7.547e-05   0.313    0.754    
rev          5.172e-05  4.865e-05   1.063    0.288    
raw         -3.040e-04  2.235e-04  -1.360    0.174    
agea39      -4.673e-02  9.150e-02  -0.511    0.610    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3236.6  on 2335  degrees of freedom
Residual deviance: 2780.5  on 2328  degrees of freedom
AIC: 2796.5

Number of Fisher Scoring iterations: 5
z221_pred = predict(z221_glm1, z221_TS, type = "response")
z221_cm = table(actual = z221_TS$buy, predict = z221_pred > 0.5);z221_cm
       predict
actual  FALSE TRUE
  FALSE   391  124
  TRUE    205  282
z221_acc.ts = z221_cm %>% {sum(diag(.))/sum(.)}
c(1-mean(z221_TS$buy) , z221_acc.ts)  # 0.6776567 
[1] 0.5139721 0.6716567
colAUC(z221_pred, z221_TS$buy)  #辨識率達0.7369989
                    [,1]
FALSE vs. TRUE 0.7369989

3.購買金額模型

z115客群預測未來購買金額

z115_A2 = subset(z115_cu_A1, z115_cu_A1$buy) %>% mutate_at(c("m","rev","amount"), log10)
z115_TR2 = subset(z115_A2, z115_spl2)
z115_TS2 = subset(z115_A2, !z115_spl2)
z115_lm1 = lm(amount ~ ., z115_TR2[,c(3:9,11)])
summary(z115_lm1)

Call:
lm(formula = amount ~ ., data = z115_TR2[, c(3:9, 11)])

Residuals:
     Min       1Q   Median       3Q      Max 
-2.04482 -0.21936  0.04393  0.27321  1.36192 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.406e+00  1.175e-01  11.964  < 2e-16 ***
r            4.926e-04  8.326e-04   0.592  0.55422    
s           -2.902e-05  8.349e-04  -0.035  0.97228    
f            2.328e-02  5.411e-03   4.302 1.82e-05 ***
m            4.745e-01  1.042e-01   4.552 5.83e-06 ***
rev          2.308e-02  9.836e-02   0.235  0.81452    
raw          6.501e-05  2.048e-05   3.174  0.00154 ** 
agea39       1.138e-02  2.316e-02   0.491  0.62328    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4098 on 1258 degrees of freedom
Multiple R-squared:  0.2777,    Adjusted R-squared:  0.2737 
F-statistic: 69.09 on 7 and 1258 DF,  p-value: < 2.2e-16
z115_r2.tr = summary(z115_lm1)$r.sq
z115_SST = sum((z115_TS2$amount - mean(z115_TR2$amount))^ 2)
z115_SSE = sum((predict(z115_lm1, z115_TS2) -  z115_TS2$amount)^2)
z115_r2.ts = 1 - (z115_SSE/z115_SST)
c(z115_R2train=z115_r2.tr, z115_R2test=z115_r2.ts)#模型可以解釋testing dataset 將近2成的變異
z115_R2train  z115_R2test 
   0.2776940    0.3015492 

z221客群預測未來購買金額

z221_A2 = subset(z221_cu_A1, z221_cu_A1$buy) %>% mutate_at(c("m","rev","amount"), log10)
z221_TR2 = subset(z221_A2, z221_spl2)
z221_TS2 = subset(z221_A2, !z221_spl2)
z221_lm1 = lm(amount ~ ., z221_TR2[,c(3:9,11)])
summary(z221_lm1)

Call:
lm(formula = amount ~ ., data = z221_TR2[, c(3:9, 11)])

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8689 -0.2432  0.0557  0.3053  1.3045 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1.439e+00  1.325e-01  10.861  < 2e-16 ***
r           7.535e-04  8.610e-04   0.875 0.381705    
s           9.269e-05  8.755e-04   0.106 0.915699    
f           2.014e-02  6.952e-03   2.897 0.003837 ** 
m           4.231e-01  1.126e-01   3.759 0.000179 ***
rev         5.801e-02  1.051e-01   0.552 0.581162    
raw         6.230e-05  2.670e-05   2.333 0.019816 *  
agea39      1.886e-02  2.527e-02   0.746 0.455802    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4237 on 1128 degrees of freedom
Multiple R-squared:  0.2264,    Adjusted R-squared:  0.2216 
F-statistic: 47.17 on 7 and 1128 DF,  p-value: < 2.2e-16
z221_r2.tr = summary(z221_lm1)$r.sq
z221_SST = sum((z221_TS2$amount - mean(z221_TR2$amount))^ 2)
z221_SSE = sum((predict(z221_lm1, z221_TS2) -  z221_TS2$amount)^2)
z221_r2.ts = 1 - (z221_SSE/z221_SST)
c(z221_R2train=z221_r2.tr, z221_R2test=z221_r2.ts)#模型可以解釋testing dataset 將近2成的變異
z221_R2train  z221_R2test 
   0.2264430    0.2253829 

4.預測未來顧客行為

load("data/tf0.rdata")
d0 = max(X0$date) + 1
B = X0 %>% 
  filter(date >= as.Date("2000-12-01")) %>% 
  mutate(days = as.integer(difftime(d0, date, units="days"))) %>% 
  group_by(cust) %>% summarise(
    r = min(days),      # recency
    s = max(days),      # seniority
    f = n(),            # frequency
    m = mean(total),    # monetary
    rev = sum(total),   # total revenue contribution
    raw = sum(gross),   # total gross profit contribution
    age = age[1],       # age group
    area = area[1],     # area code
    TID = min(tid)
  ) %>% data.frame      # 28531
nrow(B)
[1] 28531
z115_B = matrix(NA, nrow(z115_cu),ncol(z115_cu))
z115_B = B[B$TID %in% z115_cu$TID,]
z221_B = matrix(NA, nrow(z221_cu),ncol(z221_cu))
z221_B = B[B$TID %in% z221_cu$TID,]
3月份來買的機率

z115購買機率

z115_B$Buy = predict(z115_glm1, z115_B, type="response")#3月份來買的機率
z115_B2 = z115_B %>% mutate_at(c("m","rev"), log10)
z115_B$Rev = 10^predict(z115_lm1, z115_B2)
#"m","rev"兩個變數和目標變數取log
#"rev"為預期購買金額
par(mfrow=c(1,2), cex=0.8)
hist(z115_B$Buy)
hist(log(z115_B$Rev,10))

save(z115_B, file='data/tf4_z115.rdata')

z221購買機率

z221_B$Buy = predict(z221_glm1, z221_B, type="response")
z221_B2 = z221_B %>% mutate_at(c("m","rev"), log10)
z221_B$Rev = 10^predict(z221_lm1, z221_B2)
#"m","rev"兩個變數和目標變數取log
#"rev"為預期購買金額
par(mfrow=c(1,2), cex=0.8)
hist(z221_B$Buy)
hist(log(z221_B$Rev,10))

save(z221_B, file='data/tf4_z221.rdata')

顧客終身價值計算

接著透過計算顧客終生價值瞭解每一個顧客的潛在價值有多大。 Fig-3:CLV

z115客群的未來3年的終身價值
g = 0.3   # margin
N = 36    # period(一期一個月)
d = 0.01  # interest rate
z115_B$CLV = g * z115_B$Rev * rowSums(sapply(
  0:N, function(i) (z115_B$Buy/(1+d))^i ) )

summary(z115_B$CLV)#該顧客的終生價值,未來3年的營收貢獻
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   35.9   291.3   455.7   617.7   696.3 14799.0 
ggplot(z115_B, aes(CLV)) + 
  geom_histogram(bins=30, fill="green",alpha=0.6) + 
  scale_x_log10()

z221客群的未來3年的終身價值
g = 0.3   # margin
N = 36    # period(一期一個月)
d = 0.01  # interest rate
z221_B$CLV = g * z221_B$Rev * rowSums(sapply(
  0:N, function(i) (z221_B$Buy/(1+d))^i ) )

summary(z221_B$CLV)#該顧客的終生價值,未來3年的營收貢獻
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  55.21  280.14  430.81  564.34  666.77 8015.69 
ggplot(z221_B, aes(CLV)) + 
  geom_histogram(bins=30, fill="green",alpha=0.6) + 
  scale_x_log10()

帶參數的假設

S曲線
- S-Curve : 許多管理工具都呈現S型的成本效益函數
- 以R內建的邏輯式函數(plogis())來模擬S曲線 \[\Delta P(x|m,b,a) = m \cdot Logis(\frac{10(x - b)}{a})\]

DP = function(x,m0,b0,a0) {m0*plogis((10/a0)*(x-b0))}
par(mar=c(4,4,2,1),cex=0.7)
curve(DP(x,m=0.20,b=30,a=40), 0, 60, lwd=2, ylim=c(0, 0.25),
      main="F( x | m=0.2, b=30, a=40 )", ylab="delta P")
abline(h=seq(0,0.2,0.05),v=seq(0,60,5),col='lightgrey',lty=2)

Fig-44:CLV 透過這3個parameters(參數):

估計預期獲利

有了行銷工具的成本效益函數之後,我們就可以估計將這個工具用在每一位顧客上的時候的預期效益:

\[\hat{R}(x) = \left\{\begin{matrix} \Delta P \cdot M \cdot margin - x & , & P + \Delta P \leq 1\\ (1-P) \cdot M \cdot margin - x & , & else \end{matrix}\right.\]

結合以下, - 預測 (\(P, M\)) : 每位顧客的預期購買機率和購買金額, - 假設 (\(\Delta P(x|m,b,a)\)) : 行銷工具帶來的再購機率增額 我們就可以估計這個工具用在每位顧客上的預期效益 \(\hat{R}(x)\)

🌻 注意\(\Delta P\)\(\hat{R}\) 都是藉由函數 \(x\)\(m,b,a\) 所得到

  • \(P, M\) 預期購買機率和金額,是預測
  • \(m, b, a\) 行銷工具的屬性,是假設
  • \(x\) 行銷強度,是我們可以操作的、想要優化的策略變數
估計毛利率 m
margin = 0.17  # assume margin = 0.17
估計每位顧客的淨收益

z221客群的淨收益

#z115族客群的示意圖(改mba)
m=0.25; b=20; a=30; x=27
z115_dp = pmin(1-z115_B$Buy, DP(x,m,b,a))
z115_eR = z115_dp*z115_B$Rev*margin - x
hist(z115_eR,main="TA1預期淨收益分佈(m=0.20; b=20; a=30;x=27)",xlab="z115_a3439預期淨收益",ylab="z115_a3439顧客人數")

z221客群的淨收益

#z221族群的示意圖(改mba)
m=0.2; b=20; a=30; x=27
z221_dp = pmin(1-z221_B$Buy, DP(x,m,b,a))
z221_eR = z221_dp*z221_B$Rev*margin - x
hist(z221_eR,main="TA2預期淨收益分佈(m=0.20; b=20; a=30;x=27)",xlab="z221_a3439預期淨收益",ylab="z221_a3439顧客人數")

市場模擬

一個行銷工具給定工具參數(m,b,a),我們可在其有效成本範圍(x∈[b−a2,b+a2] )之內,估計工具的效果: - eReturn : 對所有的人行銷的總預期收益 - N : 預期收益大於零的人數 - eReturn2 : 只對期收益大於零的人做行銷的總預期收益如何隨成本變化。

基本折價券、累點行銷工具模擬
mm=c(0.15, 0.25)
bb=c(  10,   25)
aa=c(  20,   30)
X = seq(0,60,2) 
do.call(rbind, lapply(1:length(mm), function(i) data.frame(
  Inst = ifelse(i == 1, "折價券", "累點"), Cost=X, 
  Gain=DP(X,mm[i],bb[i],aa[i])
  ))) %>% data.frame %>% 
  ggplot(aes(x=Cost, y=Gain, col=Inst)) +
  geom_line(size=1.5,alpha=0.5) + theme_bw() +
  ggtitle("Prob. Function: f(x|m,b,a)")
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
generated.

工具模擬對於z115的效果
mm=c(0.2, 0.25)
bb=c(  20,   20)
aa=c(  30,   20)
X = seq(0,60,2) 
do.call(rbind, lapply(1:length(mm), function(i) data.frame(
  Inst = ifelse(i == 1, "折價券", "累點"), Cost=X, 
  Gain=DP(X,mm[i],bb[i],aa[i])
  ))) %>% data.frame %>% 
  ggplot(aes(x=Cost, y=Gain, col=Inst)) +
  geom_line(size=1.5,alpha=0.5) + theme_bw() +
  ggtitle("Prob. Function: f(x|m,b,a)")

X = seq(10, 60, 1) 
df = do.call(rbind, lapply(1:length(mm), function(i) {
  sapply(X, function(x) {
    z115_dp = pmin(1-z115_B$Buy, DP(x,mm[i],bb[i],aa[i]))
    z115_eR = z115_dp*z115_B$Rev*margin - x
    c(i=i, x=x, z115_eR.ALL=sum(z115_eR), N=sum(z115_eR>0), z115_eR.SEL=sum(z115_eR[z115_eR > 0]) )
    }) %>% t %>% data.frame
  })) 

df %>% 
  mutate_at(vars(z115_eR.ALL, z115_eR.SEL), function(y) round(y/1000)) %>% 
  gather('key','value',-i,-x) %>% 
  mutate(Inst = ifelse(i == 1, "折價券", "累點")) %>%
  ggplot(aes(x=x, y=value, col=Inst)) + 
  geom_hline(yintercept=0, linetype='dashed', col='blue') +
  geom_line(size=1.5,alpha=0.5) + 
  xlab('工具選項(成本)') + ylab('預期收益($K)') + 
  ggtitle('行銷工具優化','假設行銷工具的效果是其成本的函數') +
    facet_wrap(~key,ncol=1,scales='free_y') + theme_bw() -> p

plotly::ggplotly(p)
group_by(df, i) %>% top_n(1,z115_eR.SEL)
# A tibble: 2 × 5
# Groups:   i [2]
      i     x z115_eR.ALL     N z115_eR.SEL
  <dbl> <dbl>       <dbl> <dbl>       <dbl>
1     1    27        418.   623       7757.
2     2    26      13359.   962      17144.
工具模擬對於z221的效果
mm=c(0.25, 0.2)
bb=c(  15,   25)
aa=c(  20,   30)
X = seq(0,60,2) 
do.call(rbind, lapply(1:length(mm), function(i) data.frame(
  Inst = ifelse(i == 1, "折價券", "累點"), Cost=X, 
  Gain=DP(X,mm[i],bb[i],aa[i])
  ))) %>% data.frame %>% 
  ggplot(aes(x=Cost, y=Gain, col=Inst)) +
  geom_line(size=1.5,alpha=0.5) + theme_bw() +
  ggtitle("Prob. Function: f(x|m,b,a)")

X = seq(10, 60, 1) 
df = do.call(rbind, lapply(1:length(mm), function(i) {
  sapply(X, function(x) {
    z221_dp = pmin(1-z221_B$Buy, DP(x,mm[i],bb[i],aa[i]))
    z221_eR = z221_dp*z221_B$Rev*margin - x
    c(i=i, x=x, z221_eR.ALL=sum(z221_eR), N=sum(z221_eR>0), z221_eR.SEL=sum(z221_eR[z221_eR > 0]) )
    }) %>% t %>% data.frame
  })) 

df %>% 
  mutate_at(vars(z221_eR.ALL, z221_eR.SEL), function(y) round(y/1000)) %>% 
  gather('key','value',-i,-x) %>% 
  mutate(Inst = ifelse(i == 1, "折價券", "累點")) %>%
  ggplot(aes(x=x, y=value, col=Inst)) + 
  geom_hline(yintercept=0, linetype='dashed', col='blue') +
  geom_line(size=1.5,alpha=0.5) + 
  xlab('工具選項(成本)') + ylab('預期收益($K)') + 
  ggtitle('行銷工具優化','假設行銷工具的效果是其成本的函數') +
    facet_wrap(~key,ncol=1,scales='free_y') + theme_bw() -> p

plotly::ggplotly(p)

最後結果

根據預期的總效益顯示,有了以下的行銷策略:
1. 集點加購送鍋具適用於z115(南港)的客群,總觸及人數有962人,預期效益為17144。 2. 滿千送95折折價券(規定於下次使用)則適用於z221的客群(汐止區),總觸及會有1352人,預期效益30366。
故總觸及人數會有2314人,總結效益為47510。